## ---- prov-demographics ----

library(stringr)
library(readr)
library(readr)
library(dplyr)
library(dataverse)
library(SPARQL)
library(dataverse)
library(rgdal) 
library(sp)

load('replication_file_02_meetup_event_venue.RData')
load('replication_file_03_spatial_analysis.RData')
load('replication_file_04_geocode_venue_italy.RData')

# Meetup @ provincial level (participation and unemployment)
unemployment_prov <- 
  read.csv("secondary_data/DCCV_TAXDISOCCU1_06072018063709606.csv")

unemployment_prov <-
  unemployment_prov[unemployment_prov$Sesso == 'totale' & 
                      unemployment_prov$ETA1 == 'Y_GE15',]

pro_listed <- list()
for (y in 2005:2017) {
  this_names <- 
    as.character(unemployment_prov$Territorio[unemployment_prov$TIME == y])
  this_names <- gsub(" / (.*)$", "", this_names)
  pro_listed[[as.character(y)]] <- str_trim(this_names)
  names(pro_listed[[as.character(y)]]) <- 
    unemployment_prov$ITTER107[unemployment_prov$TIME == y]
}

shp_files <- 
  list.files('secondary_data/istat_prov_shapefiles', 
             recursive = T, pattern = 'shp',
             full.names = T)

prepareName <- function(str) {
  return(gsub("/(.*)$", "", str))
}

province_sp_list <- list() 
for (y in 2005:2017) {
  print(y)
  if (y %in% c(2006, 2007)) {
    province_sp_list[[as.character(y)]] <- 
      province_sp_list[['2005']]
    next
  }
  this_file <- shp_files[grepl(y, shp_files)]
  this_dns <- gsub("/Prov[A-Za-z0-9_]+.shp$", "", this_file)
  this_layer <- 
    gsub("\\.shp", "", gsub("^(.*)Prov[A-Za-z0-9_]+/", "", this_file))
  province_sp_list[[as.character(y)]] <- 
    readOGR(this_dns, this_layer)
  if (!y %in% 2015:2017) {
    province_sp_list[[as.character(y)]]$str_name <- 
      prepareName(province_sp_list[[as.character(y)]]$DEN_PROV)
  } else {
    province_sp_list[[as.character(y)]]$str_name <- 
      prepareName(province_sp_list[[as.character(y)]]$DEN_PCM)
  }
}


for (y in 2005:2017) {
  province_sp_list[[as.character(y)]]$str_name[
    province_sp_list[[as.character(y)]]$str_name == 'Bolzano'] <- 
    "Provincia Autonoma Bolzano"
  province_sp_list[[as.character(y)]]$str_name[
    province_sp_list[[as.character(y)]]$str_name == 'Trento'] <- 
    "Provincia Autonoma Trento"
  
  province_sp_list[[as.character(y)]]$str_name[
    province_sp_list[[as.character(y)]]$str_name == "Forli'-Cesena"] <- 
    "Forlì-Cesena"
  province_sp_list[[as.character(y)]]$str_name[
    province_sp_list[[as.character(y)]]$str_name == "Massa Carrara"] <- 
    "Massa-Carrara"
  
  if (y > 2014) {
    province_sp_list[[as.character(y)]]$str_name[
      province_sp_list[[as.character(y)]]$str_name == 'Aosta'] <- 
      "Valle d'Aosta"
  }
  
  province_sp_list[[as.character(y)]]$ITTER107 <-
    names(pro_listed[[as.character(y)]])[
      match(province_sp_list[[as.character(y)]]$str_name,
            pro_listed[[as.character(y)]])]
  
  this_dat <- unemployment_prov[unemployment_prov$TIME == y,]
  
  province_sp_list[[as.character(y)]]$unemployment <- 
    this_dat$Value[
      match(province_sp_list[[as.character(y)]]$ITTER107, this_dat$ITTER107)]
}

## Add employment
istat_prov_employment <- 
  read_csv("secondary_data/DCCV_TAXOCCU1_09072018060529303.csv")
istat_prov_employment <- 
  istat_prov_employment[istat_prov_employment$Sesso=='totale' &
                          istat_prov_employment$ETA1 == 'Y_GE15',]

for (y in 2005:2017) {
  this_dat <- istat_prov_employment[istat_prov_employment$TIME == y,]
  province_sp_list[[as.character(y)]]$employment <- 
    this_dat$Value[
      match(province_sp_list[[as.character(y)]]$ITTER107,this_dat$ITTER107)]
}


## Add population
for (y in 2005:2017) {
  print(y)
  this_pop_data <-
    read_csv(sprintf("secondary_data/istat_prov_pop/province%s.zip", y), 
             locale = locale(encoding = "WINDOWS-1252"), 
             skip = 1)
  names(this_pop_data)[3] <- 'age'
  this_pop_data <- 
    this_pop_data %>%
    dplyr::group_by(`Codice Provincia`, `Descrizione Provincia`) %>%
    dplyr::summarize(pop = sum(`Totale Maschi`) + sum(`Totale Femmine`),
                     pop_over_65 = sum(`Totale Maschi`[age>=65]) + 
                       sum(`Totale Femmine`[age>=65]))
  this_pop_data$`Codice Provincia` <- as.numeric(this_pop_data$`Codice Provincia`)
  province_sp_list[[as.character(y)]]$population <- 
    this_pop_data$pop[match(province_sp_list[[as.character(y)]]$COD_PROV, 
                            this_pop_data$`Codice Provincia`)]
  province_sp_list[[as.character(y)]]$population_over_65 <- 
    this_pop_data$pop_over_65[match(province_sp_list[[as.character(y)]]$COD_PROV, 
                                    this_pop_data$`Codice Provincia`)]
}

province_sp_list[['2017']]$population[
  province_sp_list[['2017']]$DEN_PROV == 'Sud Sardegna'] <- 
  98623 + 126324


## Add foreign population
f <- get_file("istat_provincial_foreign_residents.csv", 
              "doi:10.7910/DVN/YM7IWP")
istat_provincial_foreign_residents <- 
  read_csv(f, locale = locale(encoding = "WINDOWS-1252"))
istat_provincial_foreign_residents <-
  istat_provincial_foreign_residents[istat_provincial_foreign_residents$Sesso == 'totale', ]

library(SPARQL)
query <- 'SELECT ?country ?countryLabel WHERE {
?country wdt:P31 wd:Q6256.
?country wdt:P30 wd:Q15
SERVICE wikibase:label {
bd:serviceParam wikibase:language "it" .
}
}'
res <- SPARQL('https://query.wikidata.org/sparql', query)
african_country_labels <- res$results$countryLabel
african_country_labels <- gsub("\"|@it", "", african_country_labels)
istat_provincial_foreign_residents$africa <- 
  istat_provincial_foreign_residents$`Paese di cittadinanza` %in% african_country_labels

istat_provincial_foreign_residents$africa[
  istat_provincial_foreign_residents$`Paese di cittadinanza` %in% 
    c("Benin (ex Dahomey)", "Burkina Faso (ex Alto Volta)", 
      "Centrafricana, Repubblica", "Congo (Repubblica del)",
      "Congo, Repubblica democratica del (ex Zaire)", 
      "Guinea equatoriale", "São Tomé e Principe",
      "Sud Africa", "Sud Sudan, Repubblica del", 
      "Zimbabwe (ex Rhodesia)")] <- 
  TRUE

istat_provincial_foreign_residents <- 
  istat_provincial_foreign_residents %>%
  group_by(Anno, Territorio) %>%
  summarize(foreignpop = sum(`0`),
            foreignpop_africa = sum(`0`[africa == TRUE]))

### Add 2017
istat_foreign_pop_2017 <- 
  read_csv("secondary_data/DCIS_POPSTRCIT1_09072018042403842.csv")
istat_foreign_pop_2017 <- 
  istat_foreign_pop_2017[istat_foreign_pop_2017$Sesso == 'totale',]
istat_foreign_pop_2017 <- 
  istat_foreign_pop_2017[istat_foreign_pop_2017$`Paese di cittadinanza` != 'Mondo',]

istat_foreign_pop_2017$africa <- 
  grepl("Africa", istat_foreign_pop_2017$`Paese di cittadinanza`)

istat_foreign_pop_2017 <- 
  istat_foreign_pop_2017 %>%
  group_by(Territorio, Anno = TIME) %>%
  summarize(foreignpop = sum(Value),
            foreignpop_africa = sum(Value[africa==TRUE]))

istat_provincial_foreign_residents <- 
  rbind(istat_provincial_foreign_residents, 
        istat_foreign_pop_2017)


all_str_names <- character()
for (y in 2005:2017) {
  print(y)
  all_str_names <- unique(c(all_str_names, province_sp_list[[as.character(y)]]$str_name))
}
all_str_names[!all_str_names %in% 
                unique(istat_provincial_foreign_residents$Territorio)]
istat_provincial_foreign_residents$Territorio[
  istat_provincial_foreign_residents$Territorio == 
    "Valle d'Aosta / Vallée d'Aoste"] <- 
  "Valle d'Aosta"
istat_provincial_foreign_residents$Territorio[
  istat_provincial_foreign_residents$Territorio == 
    "Bolzano / Bozen"] <- 
  "Provincia Autonoma Bolzano"
istat_provincial_foreign_residents$Territorio[
  istat_provincial_foreign_residents$Territorio %in% 
    c("Carbonia-Iglesias", "Medio Campidano") &
    istat_provincial_foreign_residents$Anno == 2017] <- "Sud Sardegna"

for (y in 2005:2017) {
  print(y)
  this_dat <- istat_provincial_foreign_residents[
    istat_provincial_foreign_residents$Anno == y,]
  province_sp_list[[as.character(y)]]$foreignpop <- 
    this_dat$foreignpop[
      match(province_sp_list[[as.character(y)]]$str_name, 
            this_dat$Territorio)]
  province_sp_list[[as.character(y)]]$foreignpop_africa <- 
    this_dat$foreignpop_africa[
      match(province_sp_list[[as.character(y)]]$str_name, 
            this_dat$Territorio)]
}

### 2010 
#### Milano / Monza e della Brianza
value1 <- province_sp_list[['2012']]$foreignpop[
  province_sp_list[['2012']]$str_name=="Milano"]
value2 <- province_sp_list[['2012']]$foreignpop[
  province_sp_list[['2012']]$str_name=="Monza e della Brianza"]
prop1 <- value1 / (value1 + value2)
prop2 <- value2 / (value1 + value2)

province_sp_list[['2010']]$foreignpop[
  province_sp_list[['2010']]$str_name=="Monza e della Brianza"] <- 
  round(province_sp_list[['2010']]$foreignpop[
    province_sp_list[['2010']]$str_name=="Milano"]*prop2 , 0)
province_sp_list[['2010']]$foreignpop[
  province_sp_list[['2010']]$str_name=="Milano"] <- 
  round(province_sp_list[['2010']]$foreignpop[
    province_sp_list[['2010']]$str_name=="Milano"]*prop1 , 0)

value1 <- province_sp_list[['2012']]$foreignpop_africa[
  province_sp_list[['2012']]$str_name=="Milano"]
value2 <- province_sp_list[['2012']]$foreignpop_africa[
  province_sp_list[['2012']]$str_name=="Monza e della Brianza"]
prop1 <- value1 / (value1 + value2)
prop2 <- value2 / (value1 + value2)

province_sp_list[['2010']]$foreignpop_africa[
  province_sp_list[['2010']]$str_name=="Monza e della Brianza"] <- 
  round(province_sp_list[['2010']]$foreignpop_africa[
    province_sp_list[['2010']]$str_name=="Milano"]*prop2 , 0)
province_sp_list[['2010']]$foreignpop_africa[
  province_sp_list[['2010']]$str_name=="Milano"] <- 
  round(province_sp_list[['2010']]$foreignpop_africa[
    province_sp_list[['2010']]$str_name=="Milano"]*prop1 , 0)

#### Ascoli Piceno / Fermo
value1 <- province_sp_list[['2011']]$foreignpop[
  province_sp_list[['2011']]$str_name=="Ascoli Piceno"]
value2 <- province_sp_list[['2011']]$foreignpop[
  province_sp_list[['2011']]$str_name=="Fermo"]
prop1 <- value1 / (value1 + value2)
prop2 <- value2 / (value1 + value2)

province_sp_list[['2010']]$foreignpop[
  province_sp_list[['2010']]$str_name=="Fermo"] <- 
  round(province_sp_list[['2010']]$foreignpop[
    province_sp_list[['2010']]$str_name=="Ascoli Piceno"]*prop2 , 0)
province_sp_list[['2010']]$foreignpop[
  province_sp_list[['2010']]$str_name=="Ascoli Piceno"] <- 
  round(province_sp_list[['2010']]$foreignpop[
    province_sp_list[['2010']]$str_name=="Ascoli Piceno"]*prop1 , 0)

value1 <- province_sp_list[['2011']]$foreignpop_africa[
  province_sp_list[['2011']]$str_name=="Ascoli Piceno"]
value2 <- province_sp_list[['2011']]$foreignpop_africa[
  province_sp_list[['2011']]$str_name=="Fermo"]
prop1 <- value1 / (value1 + value2)
prop2 <- value2 / (value1 + value2)

province_sp_list[['2010']]$foreignpop_africa[
  province_sp_list[['2010']]$str_name=="Fermo"] <- 
  round(province_sp_list[['2010']]$foreignpop_africa[
    province_sp_list[['2010']]$str_name=="Ascoli Piceno"]*prop2 , 0)
province_sp_list[['2010']]$foreignpop_africa[
  province_sp_list[['2010']]$str_name=="Ascoli Piceno"] <- 
  round(province_sp_list[['2010']]$foreignpop_africa[
    province_sp_list[['2010']]$str_name=="Ascoli Piceno"]*prop1 , 0)

#### Bari + Foggia / Barletta-Andria-Trani
value1 <- province_sp_list[['2011']]$foreignpop[
  province_sp_list[['2011']]$str_name=="Bari"]
value2 <- province_sp_list[['2011']]$foreignpop[
  province_sp_list[['2011']]$str_name=="Foggia"]
value3 <- province_sp_list[['2011']]$foreignpop[
  province_sp_list[['2011']]$str_name=="Barletta-Andria-Trani"]
prop1 <- value1 / (value1 + value2 + value3)
prop2 <- value2 / (value1 + value2 + value3)
prop3 <- value3 / (value1 + value2 + value3)

value2010 <- 
  province_sp_list[['2010']]$foreignpop[
    province_sp_list[['2010']]$str_name=="Bari"] +
  province_sp_list[['2010']]$foreignpop[
    province_sp_list[['2010']]$str_name=="Foggia"]

province_sp_list[['2010']]$foreignpop[
  province_sp_list[['2010']]$str_name=="Bari"] <- 
  round(value2010*prop1,0)
province_sp_list[['2010']]$foreignpop[
  province_sp_list[['2010']]$str_name=="Foggia"] <- 
  round(value2010*prop2,0)
province_sp_list[['2010']]$foreignpop[
  province_sp_list[['2010']]$str_name=="Barletta-Andria-Trani"] <- 
  round(value2010*prop3,0)

value1 <- province_sp_list[['2011']]$foreignpop_africa[
  province_sp_list[['2011']]$str_name=="Bari"]
value2 <- province_sp_list[['2011']]$foreignpop_africa[
  province_sp_list[['2011']]$str_name=="Foggia"]
value3 <- province_sp_list[['2011']]$foreignpop_africa[
  province_sp_list[['2011']]$str_name=="Barletta-Andria-Trani"]
prop1 <- value1 / (value1 + value2 + value3)
prop2 <- value2 / (value1 + value2 + value3)
prop3 <- value3 / (value1 + value2 + value3)

value2010 <- 
  province_sp_list[['2010']]$foreignpop_africa[
    province_sp_list[['2010']]$str_name=="Bari"] +
  province_sp_list[['2010']]$foreignpop_africa[
    province_sp_list[['2010']]$str_name=="Foggia"]

province_sp_list[['2010']]$foreignpop_africa[
  province_sp_list[['2010']]$str_name=="Bari"] <- round(value2010*prop1,0)
province_sp_list[['2010']]$foreignpop_africa[
  province_sp_list[['2010']]$str_name=="Foggia"] <- round(value2010*prop2,0)
province_sp_list[['2010']]$foreignpop_africa[
  province_sp_list[['2010']]$str_name=="Barletta-Andria-Trani"] <- 
  round(value2010*prop3,0)


### Check
for (y in 2005:2017) {
  print(y)
  print(any(is.na(province_sp_list[[as.character(y)]]$foreignpop_africa)))
}

## Measure meetup density
meetup_event_it <- 
  meetup_event[meetup_event$venue %in% meetup_venue_it$venue_id,]
for (y in 2006:2017) {
  print(y)
  week_seq <- seq(from = as.Date(sprintf("%s-01-01",y)),
                  to = as.Date(sprintf("%s-01-01",y+1))-1,
                  by = "1 week")
  these_events <- subset(meetup_event_it, yes_rsvp_count > 0 & grepl(y, time))
  these_venues <- meetup_venue_it[meetup_venue_it$venue_id %in% these_events$venue,]
  these_events$week <- cut(as.Date(these_events$time), breaks = week_seq)
  res <- over(these_venues, province_sp_list[[as.character(y)]])
  these_venues$ITTER107 <- res$ITTER107 
  if(any(is.na(these_venues$ITTER107))) stop
  these_events$ITTER107 <- 
    these_venues$ITTER107[match(these_events$venue, these_venues$venue_id)]
  if(any(is.na(these_events$ITTER107))) stop
  count_tbl <- as.data.frame(table(these_events$ITTER107))
  province_sp_list[[as.character(y)]]$n_meetup_events <- NA
  province_sp_list[[as.character(y)]]$n_meetup_events <- 
    count_tbl$Freq[match(province_sp_list[[as.character(y)]]$ITTER107, count_tbl$Var1)]
  province_sp_list[[as.character(y)]]$n_meetup_events[
    is.na(province_sp_list[[as.character(y)]]$n_meetup_events)] <- 0
  these_meetup_7days <-
    these_events %>%
    dplyr::group_by(ITTER107) %>%
    dplyr::summarize(meetup_7days = length(unique(week)))
  province_sp_list[[as.character(y)]]$meetup_7days <- 
    these_meetup_7days$meetup_7days[
      match(province_sp_list[[as.character(y)]]$ITTER107, 
            these_meetup_7days$ITTER107)]
  province_sp_list[[as.character(y)]]$meetup_7days[
    is.na(province_sp_list[[as.character(y)]]$meetup_7days)] <- 0
}

# Meetup freq pre election
## 2013
y <- 2013
election_date_2013 <- as.Date("2013-02-24")
week_seq <- seq(from = election_date_2013 - 6 * 30,
                to = election_date_2013,
                by = 'week')
these_events <- 
  subset(meetup_event_it, yes_rsvp_count > 0 & 
           as.Date(time) > min(week_seq) & 
           as.Date(time) < max(week_seq))
these_venues <- 
  meetup_venue_it[meetup_venue_it$venue_id %in% these_events$venue,]
these_events$week <- 
  cut(as.Date(these_events$time), breaks = week_seq)
res <- over(these_venues, province_sp_list[[as.character(y)]])
these_venues$ITTER107 <- res$ITTER107 
these_events$ITTER107 <- 
  these_venues$ITTER107[match(these_events$venue, these_venues$venue_id)]
meetup2013_7days <-
  these_events %>%
  dplyr::group_by(ITTER107) %>%
  dplyr::summarize(meetup_7days = length(unique(week)))

## 2017
y <- 2017
election_date_2018 <- as.Date("2018-03-04")
week_seq <- seq(from = election_date_2018 - 6 * 30,
                to = election_date_2018,
                by = 'week')
these_events <- 
  subset(meetup_event_it, yes_rsvp_count > 0 & 
           as.Date(time) > min(week_seq) & as.Date(time) < max(week_seq))
these_venues <- 
  meetup_venue_it[meetup_venue_it$venue_id %in% these_events$venue,]
these_events$week <- 
  cut(as.Date(these_events$time), breaks = week_seq)
res <- over(these_venues, province_sp_list[[as.character(y)]])
these_venues$ITTER107 <- 
  res$ITTER107 
these_events$ITTER107 <- 
  these_venues$ITTER107[match(these_events$venue, these_venues$venue_id)]
meetup2018_7days <-
  these_events %>%
  dplyr::group_by(ITTER107) %>%
  dplyr::summarize(meetup_7days = length(unique(week)))

## Meetup

for(i in 2:13) {
  province_sp_list[[i]]$n_meetup_events_per_10k <- 
    province_sp_list[[i]]$n_meetup_events /
    (province_sp_list[[i]]$population/10000)
}


## Assign region
load('replication_file_05_regione2018_sp_sf.RData')
for (y in 2005:2017) {
  print(y)
  this_crs <- 
    CRS(proj4string(province_sp_list[[as.character(y)]]))
  these_coords <- 
    coordinates(province_sp_list[[as.character(y)]])
  these_coords <- SpatialPointsDataFrame(these_coords, 
                                         data = province_sp_list[[as.character(y)]]@data,
                                         proj4string = this_crs)
  res <- over(these_coords, spTransform(regione2018_sp, this_crs))
  province_sp_list[[as.character(y)]]$region <- 
    res$DEN_REG
  
  province_sp_list[[as.character(y)]]$region[
    province_sp_list[[as.character(y)]]$str_name == 'Rimini'] <- 
    "Emilia-Romagna"
  province_sp_list[[as.character(y)]]$region[
    province_sp_list[[as.character(y)]]$str_name == 'Cagliari'] <- 
    "Sardegna"
}

## Check
for (y in 2005:2017) {
  print(y)
  print(any(is.na(province_sp_list[[as.character(y)]]$population)))
  print(any(is.na(province_sp_list[[as.character(y)]]$n_meetup_events)))
  # print(any(is.na(province_sp_list[[as.character(y)]]$region)))
  print(province_sp_list[[as.character(y)]]$str_name[
    is.na(province_sp_list[[as.character(y)]]$region)])
}

# Add income
base_url <- 
  'http://www1.finanze.gov.it/finanze3/analisi_stat/v_4_0_0/contenuti/Redditi_e_principali_variabili_IRPEF_su_base_comunale_CSV_%s.zip'


for (y in 2008:2016) {
  print(y)
  td <- tempdir()
  # create the placeholder file
  tf <- tempfile(tmpdir=td, fileext=".zip")
  # download into the placeholder file
  download.file(sprintf(base_url, y), tf)
  fname = unzip(tf, list=TRUE)$Name[1]
  unzip(tf, files=fname, exdir=td, overwrite=TRUE)
  fpath = file.path(td, fname)
  this_dat <- read.csv(fpath, header=TRUE, row.names=NULL, sep=";", 
                       stringsAsFactors=FALSE, 
                       na.strings = "")
  freq_col_i <- 
    which(grepl("Reddito.complessivo", colnames(this_dat)) & 
            grepl("Frequenza", colnames(this_dat)))
  amm_col_i <- 
    which(grepl("Reddito.complessivo", colnames(this_dat)) & 
            grepl("Ammontare", colnames(this_dat)))
  
  tot_df <- 
    data.frame(SIGLA = this_dat$Sigla.Provincia,
               n_taxpayers = apply(this_dat[,freq_col_i], 1, sum, na.rm = T),
               taxable_income = apply(this_dat[,amm_col_i], 1, sum,  na.rm = T),
               stringsAsFactors = F)
  
  tot_df <- tot_df[!is.na(tot_df$SIGLA),]
  tot_df <- 
    tot_df %>%
    dplyr::group_by(SIGLA) %>%
    summarize_all(sum)
  
  print(head(tot_df))
  
  province_sp_list[[as.character(y)]] <- 
    merge(province_sp_list[[as.character(y)]],
          tot_df, 
          by = 'SIGLA')
  
}

save(province_sp_list, file = "replication_file_07_province_sp_list.RData")
